05. Markov chain Monte Carlo

# Chunk defaults for the rendered notes: echo the code, render high-dpi
# figures, and prefix printed output with "#>".
knitr::opts_chunk$set(echo = TRUE, dpi = 300, comment = "#>")

library(glue)
library(ggtext)
library(patchwork)
suppressPackageStartupMessages(library(tidyverse))

# Use the black-and-white ggplot2 theme for every plot in this document.
theme_set(theme_bw())

Resources

Notes

Reading instructions

Chapter 11. Basics of Markov chain simulation

Introduction

11.1 Gibbs sampler

\[\begin{equation} \begin{pmatrix} \theta_1 \\ \theta_2 \end{pmatrix} | y \sim \text{N} \begin{pmatrix} \begin{pmatrix} y_1 \\ y_2 \end{pmatrix}, \begin{pmatrix} 1 & \rho \\ \rho & 1 \\ \end{pmatrix} \end{pmatrix} \tag{1} \end{equation}\]

\[\begin{align} \begin{split} \theta_1 | \theta_2, y &\sim \text{N}(y_1 + \rho (\theta_2 - y_2), 1 - \rho^2) \\ \theta_2 | \theta_1, y &\sim \text{N}(y_2 + \rho (\theta_1 - y_1), 1 - \rho^2) \end{split} \tag{2} \end{align}\]

# Convert an MCMC chain stored as a list of equal-length numeric vectors
# (one vector per saved step) into a tibble with one row per step and the
# supplied column names.
chain_to_df <- function(chain, names) {
  step_rows <- lapply(chain, function(step) as.data.frame(t(step)))
  step_rows %>%
    dplyr::bind_rows() %>%
    tibble::as_tibble() %>%
    purrr::set_names(names)
}

# Run a single chain of a Gibbs sampler for the bivariate normal posterior
# in equation (1), alternating draws from the two conditionals in
# equation (2).
#
# data: numeric vector c(y1, y2) of observed values (the posterior mean).
# rho: correlation of the bivariate normal posterior.
# theta_t0: numeric vector of length 2, the chain's starting point.
# N: number of Gibbs iterations; each iteration after the first records
#   two half-steps (one per conditional update).
#
# Returns a tibble with columns theta_1, theta_2, and t (the iteration
# index), built by `chain_to_df()`.
gibbs_sample_demo <- function(data, rho, theta_t0, N = 100) {
  theta_1 <- theta_t0[[1]]
  theta_2 <- theta_t0[[2]]
  y1 <- data[[1]]
  y2 <- data[[2]]

  # The conditional distributions have variance 1 - rho^2, but rnorm()
  # takes a standard deviation, so we must pass the square root. (The
  # original passed the variance directly, shrinking the draws.)
  cond_sd <- sqrt(1 - rho^2)

  # Preallocate: the starting entry plus two half-steps for each of the
  # N - 1 loop iterations. (The original used `rep(theta_t0, n = ...)`,
  # but rep() has no `n` argument — it was silently ignored, so the list
  # started at length 2 and grew inside the loop.)
  chain <- vector("list", 2 * (N - 1) + 1)
  chain[[1]] <- c(theta_1, theta_2, 1)

  for (t in seq(2, N)) {
    theta_1 <- rnorm(1, y1 + rho * (theta_2 - y2), cond_sd)
    chain[[2 * (t - 1)]] <- c(theta_1, theta_2, t)
    theta_2 <- rnorm(1, y2 + rho * (theta_1 - y1), cond_sd)
    chain[[2 * (t - 1) + 1]] <- c(theta_1, theta_2, t)
  }

  chain_to_df(chain, names = c("theta_1", "theta_2", "t"))
}


# Shared demo settings: the posterior correlation, the observed data, and
# four overdispersed starting points (one per chain).
rho <- 0.8
y <- c(0, 0)
starting_points <- list(
  c(-2.5, -2.5), c(2.5, -2.5), c(-2.5, 2.5), c(2.5, 2.5)
)

# Run one Gibbs chain per starting point and stack the results, labeling
# each chain by its (1-based) position in `starting_points`.
set.seed(0)
gibbs_demo_chains <- purrr::imap_dfr(
  starting_points,
  ~ gibbs_sample_demo(y, rho, .x) %>%
    add_column(chain = as.character(.y))
)

# Draw each chain's trajectory as a connected path, colored by chain.
# The x, y, and color arguments take bare column names (tidy evaluation).
plot_chains <- function(chain_df, x = theta_1, y = theta_2, color = chain) {
  base_plot <- ggplot(
    chain_df,
    aes(x = {{ x }}, y = {{ y }}, color = {{ color }})
  )
  base_plot +
    geom_path(alpha = 0.6, show.legend = FALSE) +
    scale_color_brewer(type = "qual", palette = "Set1")
}

# Draw the individual draws of each chain as points, colored by chain.
# The x, y, and color arguments take bare column names (tidy evaluation).
plot_points <- function(chain_df, x = theta_1, y = theta_2, color = chain) {
  base_plot <- ggplot(
    chain_df,
    aes(x = {{ x }}, y = {{ y }}, color = {{ color }})
  )
  base_plot +
    geom_point(size = 0.75, alpha = 0.75) +
    scale_color_brewer(type = "qual", palette = "Set1")
}

# Relabel a plot's axes as subscripted theta symbols, rendered as HTML
# markdown by ggtext::element_markdown().
theta_axis_labs <- function(p) {
  labeled <- p + labs(x = "θ<sub>1</sub>", y = "θ<sub>2</sub>")
  labeled +
    theme(
      axis.title.x = element_markdown(),
      axis.title.y = element_markdown()
    )
}

# Left panel: the full trajectory of each Gibbs chain.
gibbs_plot_chains <- plot_chains(gibbs_demo_chains) %>%
  theta_axis_labs()

# Right panel: keep only the last half-step of each iteration (both
# coordinates updated), so each iteration contributes a single point.
gibbs_plot_points <- gibbs_demo_chains %>%
  group_by(chain, t) %>%
  slice_tail(n = 1) %>%
  ungroup() %>%
  plot_points() %>%
  theta_axis_labs()

# Combine the two panels side-by-side with patchwork.
(gibbs_plot_chains | gibbs_plot_points) + plot_annotation(title = "Gibbs sampler")

11.2 Metropolis and Metropolis-Hastings algorithms

The Metropolis algorithm
# Compute the Metropolis density ratio r = p(theta* | y) / p(theta^(t-1) | y)
# for a multivariate normal target distribution. Computed on the log scale
# and exponentiated, which avoids 0/0 underflow when both densities are
# tiny far out in the tails; mathematically identical to the direct ratio.
#
# t_star: proposed parameter value.
# t_m1: current parameter value (theta at step t - 1).
# data: mean vector of the target normal distribution.
# prior_cov_mat: covariance matrix of the target normal distribution.
#
# Returns the density ratio as a single numeric value.
calc_metropolis_density_ratio <- function(t_star, t_m1, data, prior_cov_mat) {
  log_numerator <- mvtnorm::dmvnorm(t_star, data, prior_cov_mat, log = TRUE)
  log_denominator <- mvtnorm::dmvnorm(t_m1, data, prior_cov_mat, log = TRUE)
  exp(log_numerator - log_denominator)
}

# Run a single chain of the Metropolis algorithm targeting a bivariate
# normal distribution centered at `data` with identity covariance, using a
# symmetric normal jumping distribution scaled by 0.2.
#
# data: numeric vector c(y1, y2), the mean of the target distribution.
# theta_t0: starting point of the chain (length-2 numeric vector or list).
# N: number of Metropolis steps taken after the starting point.
# quiet: if FALSE, message() the fraction of accepted jumps.
#
# Returns a tibble with columns theta_1 and theta_2 and N + 1 rows (the
# starting point plus one row per step; rejected steps repeat the current
# value), built by `chain_to_df()`.
metropolis_algorithm_demo <- function(data, theta_t0, N = 1000, quiet = FALSE) {
  theta_t <- unlist(theta_t0)
  target_dist_cov_mat <- diag(2)
  # Jumping (proposal) distribution: same shape as the target, scaled down
  # so individual jumps are small.
  jumping_dist_cov_mat <- target_dist_cov_mat * 0.2^2

  # Preallocate the chain: the starting point plus N steps. (The original
  # used `rep(NA_real_, n = ...)`, but rep() has no `n` argument — it was
  # silently ignored, so the list started at length 1 and grew in the loop.)
  chain <- vector("list", N + 1)
  chain[[1]] <- theta_t

  n_accepts <- 0

  for (t in seq(2, N + 1)) {
    # Propose a candidate from the symmetric jumping distribution centered
    # at the current value.
    theta_star <- mvtnorm::rmvnorm(
      n = 1, mean = theta_t, sigma = jumping_dist_cov_mat
    )[1, ]

    density_ratio <- calc_metropolis_density_ratio(
      t_star = theta_star,
      t_m1 = theta_t,
      data = data,
      prior_cov_mat = target_dist_cov_mat
    )

    # Accept with probability min(1, r); on rejection the chain stays at
    # (and records) the current value.
    accept <- runif(1) < min(1, density_ratio)
    if (accept) {
      theta_t <- theta_star
      n_accepts <- n_accepts + 1
    }
    chain[[t]] <- theta_t
  }

  if (!quiet) {
    frac_accepts <- n_accepts / N
    message(glue("fraction of accepted jumps: {frac_accepts}"))
  }
  chain_to_df(chain, names = c("theta_1", "theta_2"))
}

# Run four Metropolis chains, one from each overdispersed starting point,
# and stack them with a chain-identifier column.
set.seed(0)
metropolis_chains <- purrr::map_dfr(
  seq(1, 4),
  ~ metropolis_algorithm_demo(c(0, 0), starting_points[[.x]]) %>%
    add_column(chain = as.character(.x))
)
#> fraction of accepted jumps: 0.888
#> fraction of accepted jumps: 0.868
#> fraction of accepted jumps: 0.914
#> fraction of accepted jumps: 0.869
# Left panel: the full trajectory of each chain.
metropolis_plot_chains <- plot_chains(metropolis_chains) %>%
  theta_axis_labs()
# Right panel: only the last 500 draws of each chain (first half discarded
# as warm-up). NOTE(review): the data frame is still grouped by `chain`
# when passed to plot_points(); harmless for plotting, but an ungroup()
# here would match the Gibbs example above.
metropolis_plot_points <- metropolis_chains %>%
  group_by(chain) %>%
  slice_tail(n = 500) %>%
  plot_points() %>%
  theta_axis_labs()

(metropolis_plot_chains | metropolis_plot_points) + plot_annotation(title = "Metropolis algorithm")

The Metropolis-Hastings algorithm

\[\begin{equation} r = \frac{p(\theta^* | y) / J_t(\theta^* | \theta^{t-1})}{p(\theta^{t-1} | y) / J_t(\theta^{t-1} | \theta^*)} \tag{3} \end{equation}\]

11.4 Inference and assessment of convergence

Difficulties of inference from iterative simulation
Discarding early iterations of the simulation runs
Dependence of the iterations in each sequence
Multiple sequences with overdispersed starting points
Monitoring scalar estimands
Challenges of monitoring convergence: mixing and stationarity
Splitting each saved sequence into two parts
Assessing mixing using between- and within-sequence variances

\[\begin{equation} B = \frac{n}{m-1} \sum_j^m (\bar{\psi}_{.j} - \bar{\psi}_{..})^2 \\ \quad \text{where} \quad \bar{\psi}_{.j} = \frac{1}{n} \sum _i^n \psi_{ij} \quad \text{and} \quad \bar{\psi}_{..} = \frac{1}{m} \sum_j^m \bar{\psi}_{.j} \tag{4} \end{equation}\]

\[\begin{equation} W = \frac{1}{m} \sum_j^m s_j^2 \quad \text{where} \quad s_j^2 = \frac{1}{n-1} \sum_i^n (\psi_{ij} - \bar{\psi}_{.j})^2 \tag{5} \end{equation}\]

\[\begin{equation} \widehat{\text{var}}^+(\psi|y) = \frac{n-1}{n}W + \frac{1}{n}B \tag{6} \end{equation}\]

11.5 Effective number of simulation draws

Lecture notes

5.1. Markov chain Monte Carlo, Gibbs sampling, Metropolis algorithm

5.2. Warm-up, convergence diagnostics, R-hat, and effective sample size

Rhat-with-few-draws

Rhat-with-many-draws - update \(\widehat{R}\) is rank normalized \(\widehat{R}\) - original \(\widehat{R}\) requires that the target distribution has finite mean and variance - rank normalized removes this requirement - improved detection of different scales between chains - the paper also proposes local convergence diagnostics and practical MCSE estimates for quantiles - autocorrelation in the chains (think of as a time series analysis) - describes the correlation given a certain lag - how many steps does it take for the chain to forget a previous step - can be used to compare efficiency of MCMC algorithms and parameterizations - in the example below, the autocorrelation plot shows that it takes about 40 steps to reach a correlation of 0 - the x-axis should be “lag”

autocorrelation-example - calculating the autocorrelation function - \(\hat{\rho}_{n,m}\) is the autocorrelation at lag \(n\) for chain \(m\) of \(M\) chains - can see the use of \(W\) and \(\widehat{\text{var}}^+\) from the calculation of \(\widehat{R}\) so that it accounts for how well the chains mix

\[ \hat{\rho}_n = 1 - \frac{W - \frac{1}{M} \sum_m^M \hat{\rho}_{n,m}}{2 \widehat{\text{var}}^+} \]